import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
sns.set_theme(style="whitegrid")
# log10(flow) ~ prop_users_dest + prop_users_orig + query_date + log10(distance) + indicator_vars
fit_df = pd.read_csv("~/Nextcloud/linkedin_recruiter/outputs/cohen_model_output_2021-02-15.csv")
# df = pd.read_csv("N:/johnson/linkedin_recruiter/outputs/model_output_2021-02-09.csv")
# residuals vs. log10(distance)
fig = px.scatter(
fit_df.sort_values(by='query_date'), x=np.log10(fit_df['distance']), y='resids', color='dest_reg',
facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'],
labels={'x': 'log10(distance [km])'})
fig.add_hline(y=0)
fig.show()
# residuals vs. proportion of linkedin users in destination
fig = px.scatter(fit_df.sort_values(by='query_date'), x='prop_users_dest', y='resids', color='dest_reg',
facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'])
fig.add_hline(y=0)
fig.show()
# residuals vs. proportion of linkedin users in origin
fig = px.scatter(fit_df.sort_values(by='query_date'), x='prop_users_orig', y='resids', color='orig_reg',
facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'])
fig.add_hline(y=0)
fig.show()
# residuals vs. predictions
fig = px.scatter(
fit_df.sort_values(by='query_date'), x='preds', y=fit_df['resids'].abs(), color='orig_reg',
facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'],
labels = {'y': 'abs(residuals)'})
fig.show()
# standardized residuals, "number of standard errors away from regression line"
# quantify how large the residuals are in standard deviation units
# residual = (obs - preds)
# residuals / standard deviation
# abs(std resid) > 3 is sometimes a threshold for outlier detection
# first, very positive residuals (underestimate of y)
fit_df.loc[fit_df['sresids'].abs() > 3].sort_values(
by='sresids', ascending=False
)[['country_orig', 'country_dest', 'bin_maxgdp_orig', 'bin_maxgdp_dest']].drop_duplicates().head(10)
| country_orig | country_dest | bin_maxgdp_orig | bin_maxgdp_dest | |
|---|---|---|---|---|
| 20199 | Netherlands | Curacao | High | NaN |
| 29542 | Spain | Grenada | Middle | NaN |
| 22201 | France | New Caledonia | Middle-high | NaN |
| 25122 | France | French Guiana | Middle-high | NaN |
| 19156 | Netherlands | Suriname | High | NaN |
| 18774 | Netherlands | Aruba | High | NaN |
| 25609 | France | Reunion | Middle-high | NaN |
| 17340 | Brazil | Portugal | Middle | Middle-high |
| 20275 | France | French Polynesia | Middle-high | NaN |
| 25533 | France | Mayotte | Middle-high | NaN |
# and the very negative residuals (overestimate of y)
fit_df.loc[fit_df['sresids'].abs() > 3].sort_values(by='sresids')[['country_orig', 'country_dest', 'bin_maxgdp_orig', 'bin_maxgdp_dest']].drop_duplicates().head(10)
| country_orig | country_dest | bin_maxgdp_orig | bin_maxgdp_dest | |
|---|---|---|---|---|
| 17950 | Jamaica | Cuba | Low-middle | Low-middle |
| 25547 | Kenya | Mayotte | Low | NaN |
| 25611 | South Africa | Reunion | Low-middle | NaN |
| 17441 | Venezuela | Saint Lucia | Middle | Middle |
| 31705 | Jamaica | Dominican Republic | Low-middle | Middle |
| 9887 | Senegal | Sierra Leone | Low | Low |
| 22264 | Fiji | New Caledonia | NaN | NaN |
| 20334 | New Zealand | French Polynesia | Middle-high | NaN |
| 2715 | Kenya | South Sudan | Low | NaN |
| 36083 | Saudi Arabia | Egypt | High | Middle |
# number of predictors
P = 390
# number of observations
n = len(fit_df)
# hat-values, commonly used to measure leverage
hat_threshold = (2 * (P + 1))/n
print(hat_threshold)
# "influential" country pairs
fit_df.loc[fit_df['hat_values'] > hat_threshold].sort_values(
by='hat_values', ascending=False
)[['country_orig', 'country_dest']].drop_duplicates().head(10)
0.01928483353884094
| country_orig | country_dest | |
|---|---|---|
| 1524 | Mongolia | Kyrgyzstan |
| 21713 | Iceland | Zimbabwe |
| 16143 | Guinea-Bissau | Cape Verde |
| 40349 | Isle of Man | Gibraltar |
| 39335 | Gibraltar | Malta |
| 38205 | Gibraltar | Isle of Man |
| 20246 | Aruba | Curacao |
| 40469 | Nauru | Tonga |
| 17715 | Montserrat | Anguilla |
| 16408 | Cape Verde | Guinea-Bissau |
# Cook's Distance is a combination of standardized residuals & leverage
cooks_threshold = 4 / (n - P - 1)
print(cooks_threshold)
# "influential" country pairs
fit_df.loc[fit_df['cooks_dist'] > cooks_threshold].sort_values(
by='cooks_dist', ascending=False
)[['country_orig', 'country_dest']].drop_duplicates().head(10)
9.960407380661868e-05
| country_orig | country_dest | |
|---|---|---|
| 1524 | Mongolia | Kyrgyzstan |
| 25609 | France | Reunion |
| 20200 | Netherlands | Curacao |
| 22196 | France | New Caledonia |
| 11432 | Cambodia | Vietnam |
| 6990 | Tajikistan | Afghanistan |
| 25533 | France | Mayotte |
| 10440 | Cambodia | Thailand |
| 29542 | Spain | Grenada |
| 25123 | France | French Guiana |
# using standard residuals, cook's distance, & hat values
fig = px.scatter(
fit_df.sort_values(by='query_date'), x='hat_values', y='sresids', size=np.sqrt(fit_df['cooks_dist']*10),
color='orig_reg', facet_col='query_date', facet_col_wrap=3,
hover_data=['country_orig', 'country_dest'],
labels = {'hat_values': 'hat values', 'sresids': 'std. residuals'})
fig.show()
# let's zoom in on one date
sub_df = fit_df.query("query_date == '2020-11-19'")
fig = px.scatter(
sub_df, x='hat_values', y='sresids', size=np.sqrt(sub_df['cooks_dist']*10),
color='orig_reg', hover_data=['country_orig', 'country_dest'],
labels = {'hat_values': 'leverage (hat-values)', 'sresids': 'outliers (standardized residuals)'})
fig.add_hline(y=3, line_dash='dash')
fig.add_hline(y=-3, line_dash='dash')
fig.add_vline(x=hat_threshold, line_dash='dash')
fig.show()
sub_df = fit_df.query("query_date == '2020-10-08'").sort_values(by='maxhdi_orig')
sub_df['bin_maxhdi_orig'] = sub_df['bin_maxhdi_orig'].fillna('Null')
fig = px.scatter(
sub_df, x='hat_values', y='sresids', size=np.sqrt(sub_df['cooks_dist']*10),
color='maxhdi_orig', hover_data=['country_orig', 'country_dest'],
labels = {'hat_values': 'leverage (hat-values)', 'sresids': 'outliers (standardized residuals)'})
fig.add_hline(y=3, line_dash='dash')
fig.add_hline(y=-3, line_dash='dash')
fig.add_vline(x=hat_threshold, line_dash='dash')
fig.show()
sub_df = fit_df.query("query_date == '2020-10-08'").sort_values(by='maxhdi_dest')
sub_df['bin_maxhdi_dest'] = sub_df['bin_maxhdi_dest'].fillna('Null')
fig = px.scatter(
sub_df, x='hat_values', y='sresids', size=np.sqrt(sub_df['cooks_dist']*10),
color='maxhdi_dest', hover_data=['country_orig', 'country_dest'],
labels = {'hat_values': 'leverage (hat-values)', 'sresids': 'outliers (standardized residuals)'})
fig.add_hline(y=3, line_dash='dash')
fig.add_hline(y=-3, line_dash='dash')
fig.add_vline(x=hat_threshold, line_dash='dash')
fig.show()